Food Production
Data Cleaning and Merge Datasets
We created a new dataset that contains the information for the Food Production Index, Oil Production and Oil Price. The Index and Production variables have been log-transformed in order to bring their scale closer to the Price variable.
{r}
# --- Build a monthly food-companies index series from the raw daily CSV ---
# Read the raw food-companies dataset (expects at least columns 'date' and 'TSN')
df_food_companies <- read.csv('../data/viz_food_companies.csv')
# Keep only the date and the TSN series
df_food_companies <- df_food_companies %>% dplyr::select('date', 'TSN')
# Rename TSN to 'adjusted' (adjusted closing value)
names(df_food_companies) <- c('date', 'adjusted')
# Parse dates (ISO "YYYY-MM-DD" is the as.Date default)
df_food_companies$date <- as.Date(df_food_companies$date)
# Observed date window of the series
start_date <- as.Date(min(df_food_companies$date))
end_date <- as.Date(max(df_food_companies$date))
# Full calendar of daily dates across the window (includes non-trading days)
date_range <- seq(start_date, end_date, by = "1 day")
# Wrap the calendar in a data frame so it can be merged
date_dataset <- data.frame(Date = date_range)
# Outer-merge onto the full calendar: days with no observation get NA 'adjusted'
df_food_companies <- merge(df_food_companies, date_dataset, by.x = "date", by.y = "Date", all = TRUE)
# Check for missing values
# is.na(df_oil_price$adjusted)
# Rows containing at least one NA (inspection only; not used downstream)
df_na_rows <- df_food_companies[which(rowSums(is.na(df_food_companies)) > 0),]
# Columns containing at least one NA (inspection only; not used downstream)
df_na_cols <- df_food_companies[, which(colSums(is.na(df_food_companies)) > 0)]
# Impute the NA gaps with an exponentially weighted moving average (imputeTS::na_ma,
# window k = 4). NOTE(review): na_ma is given the whole data frame, so it is
# applied column-wise; assumes the 'date' column passes through unchanged — confirm.
imputed_time_series <- na_ma(df_food_companies, k = 4, weighting = "exponential")
# Back to a plain data frame
df_food_companies <- data.frame(imputed_time_series)
# Re-parse the date column. NOTE(review): format "%m/%d/%y" does not match the
# ISO "YYYY-MM-DD" dates created above — if the column came back as ISO text this
# yields NAs; verify what class/format na_ma returns for 'date'.
df_food_companies$date <- as.Date(df_food_companies$date,format = "%m/%d/%y")
# Split the date into Year / Month / Day columns for monthly aggregation
df_food_companies_monthly <- df_food_companies %>%
mutate(Year = lubridate::year(date),
Month = lubridate::month(date),
Day = lubridate::day(date))
# For each Year-Month, keep the last calendar day (month-end sampling).
# NOTE(review): summarize() leaves the result grouped by Year; harmless for the
# merge below, but an explicit .groups = "drop" would be cleaner.
df_food_companies_monthly <- df_food_companies_monthly %>%
group_by(Year, Month) %>%
summarize(Max_Day = max(Day))
# Rebuild a Date from Year/Month/last-day
df_food_companies_monthly <- df_food_companies_monthly %>%
mutate(date = make_date(Year, Month, Max_Day))
# Join back to recover the 'adjusted' value on each month-end date
df_food_companies_monthly <- merge(df_food_companies_monthly, df_food_companies, by = "date", all.x = TRUE)
# Keep only the date and the value
df_food_companies_monthly <- df_food_companies_monthly %>% dplyr::select("date", "adjusted")
# Final column names for the saved monthly series
names(df_food_companies_monthly) <- c('Date', 'Index')
# Persist the monthly series for the analysis section below
write.csv(df_food_companies_monthly, '../data/df_food_companies_monthly.csv', row.names = FALSE)
# ---------------------------------------------------------------------------------------------------------------------------------
# Reload the monthly food-companies series, restrict it to 2000-2022 and add a
# first-of-month key (date2) used to align all three series on the same grid.
df_food_companies_monthly <- read_csv('../data/df_food_companies_monthly.csv') %>%
  as.data.frame() %>%
  filter(year(Date) >= 2000 & year(Date) <= 2022) %>%
  mutate(date2 = as.Date(make_date(year(Date), month(Date), 01)))
# US oil production: same window, same monthly key
df_oil_production <- read_csv('../data/viz_us_oil_production.csv') %>%
  as.data.frame() %>%
  filter(year(Date) >= 2000 & year(Date) <= 2022) %>%
  mutate(date2 = as.Date(make_date(year(Date), month(Date), 01)))
# Monthly oil price (no year filter applied to this source)
df_oil_price <- read_csv('../data/df_oil_price_monthly.csv') %>%
  as.data.frame() %>%
  mutate(date2 = as.Date(make_date(year(date), month(date), 01)))
# The three series must start together: the common start date is the latest of
# the three individual start dates.
min_date <- max(c(min(df_food_companies_monthly$date2),
                  min(df_oil_production$date2),
                  min(df_oil_price$date2)))
# Trim each series to the common window, keep the key plus its value column,
# and log-transform Index / Production to bring their scale closer to Price.
df_food_companies_monthly <- df_food_companies_monthly %>%
  filter(date2 >= min_date) %>%
  dplyr::select('date2', 'Index') %>%
  mutate(Index = log(Index))
df_oil_production <- df_oil_production %>%
  filter(date2 >= min_date) %>%
  dplyr::select('date2', 'Production') %>%
  mutate(Production = log(Production))
df_oil_price <- df_oil_price %>%
  filter(date2 >= min_date) %>%
  dplyr::select('date2', 'adjusted')
# Join the three series on date2 (project helper), tidy names, sort by date
dd <- merge_dataframes(list(df_food_companies_monthly, df_oil_production, df_oil_price))
names(dd) <- c('DATE', 'Index', 'Production', 'Price')
dd <- dd %>% dplyr::arrange(DATE)
# # Create the time series object
# dd.ts <- ts(dd,star=decimal_date(min_date),frequency = 12)
# Show table
knitr::kable(head(dd))

| DATE | Index | Production | Price |
|---|---|---|---|
| 2000-08-01 | 1.873880 | 8.663715 | 1.0327613 |
| 2000-09-01 | 1.942007 | 8.658345 | 0.9740746 |
| 2000-10-01 | 2.070833 | 8.667164 | 1.0199688 |
| 2000-11-01 | 2.298103 | 8.671287 | 1.0546022 |
| 2000-12-01 | 2.157624 | 8.675051 | 0.8429909 |
| 2001-01-01 | 2.269116 | 8.665441 | 0.8954759 |
Plot
In the plot below we can observe the time series for the variables Index, Production and Price from 2000 to 2022.
{r}
# Overlay the three (log-scaled) monthly series on a single set of axes.
# The object must be named `plot`: it is piped into ggplotly() just below.
plot <- ggplot(dd, aes(x = DATE)) +
  geom_line(aes(y = Index, color = "Index"), alpha = 0.7) +
  geom_line(aes(y = Production, color = "Production"), alpha = 0.7) +
  geom_line(aes(y = Price, color = "Price"), alpha = 0.7) +
  scale_color_manual(values = c(Index = "red", Production = "blue", Price = "green")) +
  labs(
    title = "Food Production ~ Oil Price + Oil Production",
    x = "Date",
    y = "Value",
    color = "Variables"  # legend title
  ) +
  theme_minimal()
plot %>% ggplotly()

Linear Models
In FIT 1 we can observe the first linear model where the Index is the predictor variable and Oil Production and Oil Price are the independent variables. The variable Oil Price has a p-value higher than the significance level of 0.05, so it is not significant.
In FIT 2 we can see the second linear model where the Index is the predictor variable and Oil Production is the independent variable. The model has good R-squared and RMSE values.
{r}
# FIT 1: regress Index on every other column of dd (DATE, Production, Price),
# so calendar time enters the model as a linear trend term.
fit_1 <- lm(Index ~ ., data = dd)
summary(fit_1)
Call:
lm(formula = Index ~ ., data = dd)
Residuals:
Min 1Q Median 3Q Max
-0.89749 -0.14304 -0.00064 0.14419 0.56334
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.018e+01 8.989e-01 -11.325 <2e-16 ***
DATE 1.784e-04 1.830e-05 9.748 <2e-16 ***
Production 1.195e+00 1.265e-01 9.449 <2e-16 ***
Price -3.572e-02 2.482e-02 -1.439 0.151
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.236 on 265 degrees of freedom
Multiple R-squared: 0.9176, Adjusted R-squared: 0.9166
F-statistic: 983.1 on 3 and 265 DF, p-value: < 2.2e-16
{r}
# FIT 2: simplified model keeping only the significant predictor (Production);
# Price was not significant in FIT 1.
fit_2 <- lm(Index ~ Production, data = dd)
summary(fit_2)
Call:
lm(formula = Index ~ Production, data = dd)
Residuals:
Min 1Q Median 3Q Max
-0.84333 -0.19257 0.02899 0.20665 0.78099
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -17.78558 0.49156 -36.18 <2e-16 ***
Production 2.35006 0.05532 42.48 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.2939 on 267 degrees of freedom
Multiple R-squared: 0.8711, Adjusted R-squared: 0.8706
F-statistic: 1805 on 1 and 267 DF, p-value: < 2.2e-16
R^2: 0.8711293
RMSE: 0.2928089
Stationarity
The ACF plot shows high autocorrelation in the lags, so the time series is not stationary. Evaluating the results of the Augmented Dickey-Fuller test, the p-value is higher than the significance level of 0.05. Therefore, we cannot reject the null hypothesis that the time series is non-stationary (i.e. that it contains a unit root).
Time Series, ACF Plot and PACF Plot
{r}
# Residuals of the Index ~ Production model; stationarity of these residuals
# is what the ACF/PACF plots and the ADF test below assess.
lm.residuals <- residuals(fit_2)
lm.residuals %>% ggtsdisplay()

Augmented Dickey-Fuller Test
{r}
# Augmented Dickey-Fuller test on the regression residuals.
# H0: the series has a unit root (non-stationary); H1: stationary.
adf_test <- adf.test(lm.residuals)
print(adf_test)
Augmented Dickey-Fuller Test
data: lm.residuals
Dickey-Fuller = -3.2812, Lag order = 6, p-value = 0.07473
alternative hypothesis: stationary
Differencing
We apply the first difference to the residuals of the linear model to obtain a stationary time series. We can observe the volatility of the residuals and two main volatility clusters in the data points near 100 and near 250.
In the acf plot, we can observe that the differenced residuals look stationary.
{r}
lm.residuals %>% diff() %>% ggtsdisplay()

Model Selection
Based on the ACF and PACF plot for the first difference on residuals, we define:
\(p = 2\)
\(d = 1\)
\(q = 2\)
Next we will run the model diagnostics on the models that have the minimum AIC, minimum BIC and the auto.arima() results.
Model Parameters
{r}
# Grid-search ARIMA(p, d, q) orders for the regression residuals and collect
# AIC / BIC / AICc for every combination up to the limits below.
# (The original looped over `0:p_value+1`, which parses as `(0:p_value)+1` —
# a precedence trap — and then subtracted 1 everywhere; this version loops over
# the 0-based orders directly. Row order and values are unchanged.)
xt <- lm.residuals
p_value <- 2   # maximum AR order
d_value <- 1   # maximum differencing order
q_value <- 2   # maximum MA order
rows <- (p_value + 1) * (d_value + 1) * (q_value + 1)
# Preallocate one row per candidate: p, d, q, AIC, BIC, AICc
results <- matrix(NA_real_, nrow = rows, ncol = 6)
i <- 1
for (p in 0:p_value)
{
for (q in 0:q_value)
{
for (d in 0:d_value)
{
if (p + d + q <= 8) # usual complexity threshold
{
model <- Arima(xt, order = c(p, d, q), include.drift = TRUE)
results[i, ] <- c(p, d, q, model$aic, model$bic, model$aicc)
i <- i + 1
}
}
}
}
temp <- as.data.frame(results)
temp <- na.omit(temp)
names(temp) <- c("p","d","q","AIC","BIC","AICc")
#temp
knitr::kable(temp)

| p | d | q | AIC | BIC | AICc |
|---|---|---|---|---|---|
| 0 | 0 | 0 | 87.47201 | 98.25615 | 87.56258 |
| 0 | 1 | 0 | -366.00017 | -358.81820 | -365.95489 |
| 0 | 0 | 1 | -148.01258 | -133.63374 | -147.86107 |
| 0 | 1 | 1 | -364.14123 | -353.36826 | -364.05032 |
| 0 | 0 | 2 | -239.24974 | -221.27618 | -239.02160 |
| 0 | 1 | 2 | -369.21414 | -354.85019 | -369.06205 |
| 1 | 0 | 0 | -372.95980 | -358.58095 | -372.80828 |
| 1 | 1 | 0 | -364.09521 | -353.32225 | -364.00431 |
| 1 | 0 | 1 | -371.24575 | -353.27220 | -371.01762 |
| 1 | 1 | 1 | -369.82786 | -355.46391 | -369.67577 |
| 1 | 0 | 2 | -373.39050 | -351.82223 | -373.06989 |
| 1 | 1 | 2 | -367.82209 | -349.86716 | -367.59308 |
| 2 | 0 | 0 | -371.16434 | -353.19078 | -370.93620 |
| 2 | 1 | 0 | -369.50610 | -355.14215 | -369.35401 |
| 2 | 0 | 1 | -372.15506 | -350.58680 | -371.83445 |
| 2 | 1 | 1 | -368.27615 | -350.32122 | -368.04715 |
| 2 | 0 | 2 | -372.37779 | -347.21481 | -371.94867 |
| 2 | 1 | 2 | -376.84682 | -355.30090 | -376.52498 |
Model Summary
{r}
temp[which.min(temp$AIC),]

   p d q AIC BIC AICc
18 2 1 2 -376.8468 -355.3009 -376.525
{r}
temp[which.min(temp$BIC),]

   p d q AIC BIC AICc
2 0 1 0 -366.0002 -358.8182 -365.9549
{r}
temp[which.min(temp$AICc),]

   p d q AIC BIC AICc
18 2 1 2 -376.8468 -355.3009 -376.525
auto.arima()
{r}
# Let auto.arima() search for an ARIMA order on the residual series,
# as a reference against the manual grid search above.
fit_auto_arima <- auto.arima(xt)
summary(fit_auto_arima)

Series: xt
ARIMA(3,1,1)
Coefficients:
ar1 ar2 ar3 ma1
0.1451 -0.1595 0.0851 -0.1576
s.e. 0.6026 0.0620 0.1112 0.6028
sigma^2 = 0.01448: log likelihood = 189.19
AIC=-368.39 AICc=-368.16 BIC=-350.43
Training set error measures:
ME RMSE MAE MPE MAPE MASE
Training set 0.001975082 0.1192094 0.08364693 -6.130162 93.34814 0.9969181
ACF1
Training set -0.0006301736
Model Diagnostics
Comparing the results of the model diagnostics, we can observe that the ARIMA(2, 1, 2) model (the minimum-AIC candidate) is the best. The lag values in the ACF plot are within the confidence bands, so the residuals behave like white noise. The standardized residuals plotted in the Normal Q-Q plot show similarity to normality. The p-values for the Ljung-Box statistic confirm that the residuals are uncorrelated.
{r}
# Extract the (p, d, q) order of the minimum-AIC candidate for diagnostics.
# NOTE(review): this binding named AIC shadows stats::AIC for non-function
# lookups in the rest of the script; consider a name like best_aic.
AIC <- temp[which.min(temp$AIC),]
p1 <- AIC$p
d1 <- AIC$d
q1 <- AIC$q
model_output <- capture.output(sarima(xt, p1, d1, q1))
{r}
# Extract the (p, d, q) order of the minimum-BIC candidate for diagnostics.
# NOTE(review): the name BIC shadows stats::BIC (same caveat as AIC above).
BIC <- temp[which.min(temp$BIC),]
p2 <- BIC$p
d2 <- BIC$d
q2 <- BIC$q
model_output <- capture.output(sarima(xt, p2, d2, q2))
{r}
# Third candidate: the ARIMA(3, 1, 1) order reported by auto.arima() above.
p3 <- 3
d3 <- 1
q3 <- 1
model_output <- capture.output(sarima(xt, p3, d3, q3))

Residuals
We plot the residuals and the residuals squared of the arima model using the ACF and PACF plot to determine the parameters for the GARCH model.
The values that will be used to assess the model are the following:
\(p = 1\)
\(q = 2\)
Residuals ACF
{r}
# Refit the minimum-AIC ARIMA model and keep its residuals; their squared
# ACF/PACF below guide the GARCH order selection.
arima_model <- Arima(xt,order=c(p1, d1, q1),include.drift = TRUE)
arima.res <- residuals(arima_model)
acf(arima.res)

Residuals^2 ACF
{r}
acf(arima.res^2)

Residuals^2 PACF
{r}
pacf(arima.res^2)

GARCH Model
After running the GARCH model possibilities, we observe that the model with the lowest AIC is GARCH(1, 1). The summary of the model shows that the beta1 component is not significant, therefore we decide to remove that component and test the GARCH(1, 0) (i.e. ARCH(1)) model. For the second model, all the components are significant and the resulting AIC is lower compared to the AIC value of the GARCH(1, 1) model.
{r}
# Fit all GARCH candidates for orders p, q in {1, 2} on the ARIMA residuals.
# tseries::garch() takes order = c(GARCH order, ARCH order), hence c(q, p).
model <- vector("list", 4)  # preallocate one slot per candidate
cc <- 1
for (p in 1:2) {
for (q in 1:2) {
model[[cc]] <- garch(arima.res, order = c(q, p), trace = FALSE)  # FALSE, not F (F is reassignable)
cc <- cc + 1
}
}
# Compare candidates by AIC; which.min() returns a single index even when
# floating-point AICs tie (the original which(x == min(x)) could return several).
GARCH_AIC <- vapply(model, AIC, numeric(1))
model[[which.min(GARCH_AIC)]]
Call:
garch(x = arima.res, order = c(q, p), trace = F)
Coefficient(s):
a0 a1 b1
9.722e-03 1.993e-01 1.416e-15
{r}
# Fit GARCH(1,1) via fGarch and print the full summary (coefficient tests,
# residual diagnostics, information criteria). Use FALSE, not F.
summary(garchFit(~garch(1,1), arima.res, trace = FALSE))
Title:
GARCH Modelling
Call:
garchFit(formula = ~garch(1, 1), data = arima.res, trace = F)
Mean and Variance Equation:
data ~ garch(1, 1)
<environment: 0x1490ef270>
[data = arima.res]
Conditional Distribution:
norm
Coefficient(s):
mu omega alpha1 beta1
0.00047630 0.00934392 0.26985736 0.00000001
Std. Errors:
based on Hessian
Error Analysis:
Estimate Std. Error t value Pr(>|t|)
mu 4.763e-04 6.824e-03 0.070 0.94436
omega 9.344e-03 2.080e-03 4.492 7.07e-06 ***
alpha1 2.699e-01 8.707e-02 3.099 0.00194 **
beta1 1.000e-08 1.659e-01 0.000 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Log Likelihood:
215.4242 normalized: 0.8008335
Description:
Fri Dec 8 02:24:55 2023 by user:
Standardised Residuals Tests:
Statistic p-Value
Jarque-Bera Test R Chi^2 113.3319241 0.000000e+00
Shapiro-Wilk Test R W 0.9631007 2.223111e-06
Ljung-Box Test R Q(10) 5.6069862 8.471316e-01
Ljung-Box Test R Q(15) 9.0034933 8.773347e-01
Ljung-Box Test R Q(20) 12.4745097 8.987765e-01
Ljung-Box Test R^2 Q(10) 6.0190325 8.136617e-01
Ljung-Box Test R^2 Q(15) 6.6005782 9.678013e-01
Ljung-Box Test R^2 Q(20) 7.6521804 9.939302e-01
LM Arch Test R TR^2 7.0327635 8.554408e-01
Information Criterion Statistics:
AIC BIC SIC HQIC
-1.571927 -1.518474 -1.572361 -1.550460
{r}
# Reduced model: drop the non-significant GARCH term, leaving GARCH(1,0),
# i.e. an ARCH(1) specification. Use FALSE, not F.
summary(garchFit(~garch(1,0), arima.res, trace = FALSE))
Title:
GARCH Modelling
Call:
garchFit(formula = ~garch(1, 0), data = arima.res, trace = F)
Mean and Variance Equation:
data ~ garch(1, 0)
<environment: 0x1371b7040>
[data = arima.res]
Conditional Distribution:
norm
Coefficient(s):
mu omega alpha1
0.0004763 0.0093439 0.2698574
Std. Errors:
based on Hessian
Error Analysis:
Estimate Std. Error t value Pr(>|t|)
mu 0.0004763 0.0066199 0.072 0.94264
omega 0.0093439 0.0009952 9.389 < 2e-16 ***
alpha1 0.2698574 0.0837979 3.220 0.00128 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Log Likelihood:
215.4242 normalized: 0.8008335
Description:
Fri Dec 8 02:24:55 2023 by user:
Standardised Residuals Tests:
Statistic p-Value
Jarque-Bera Test R Chi^2 113.3319236 0.000000e+00
Shapiro-Wilk Test R W 0.9631007 2.223112e-06
Ljung-Box Test R Q(10) 5.6069862 8.471316e-01
Ljung-Box Test R Q(15) 9.0034933 8.773347e-01
Ljung-Box Test R Q(20) 12.4745097 8.987765e-01
Ljung-Box Test R^2 Q(10) 6.0190326 8.136617e-01
Ljung-Box Test R^2 Q(15) 6.6005783 9.678013e-01
Ljung-Box Test R^2 Q(20) 7.6521806 9.939302e-01
LM Arch Test R TR^2 7.0327636 8.554408e-01
Information Criterion Statistics:
AIC BIC SIC HQIC
-1.579362 -1.539272 -1.579607 -1.563262
Model Equation
\(y_t = \mu + \varepsilon_t, \qquad \varepsilon_t = \sigma_t z_t, \qquad \sigma_t^2 = \omega + \alpha_1 \varepsilon_{t-1}^2\)
With the fitted ARCH(1) coefficients: \(\sigma_t^2 = 0.00934 + 0.26986\,\varepsilon_{t-1}^2\), with \(\mu = 0.00048\).
Forecast Plot
Below we can observe the Forecast Plot with the predictions from the ARIMA(2, 1, 2) + GARCH(1, 0) model.
{r}
# Final volatility model: GARCH(1,0) (ARCH(1)) on the ARIMA residuals,
# then forecast 100 steps ahead with the built-in forecast plot.
final.fit <- garchFit(~garch(1,0), arima.res, trace = FALSE)  # FALSE, not F
# NOTE(review): assigning to `plot` shadows graphics::plot for non-function
# lookups; a name like garch_forecast would be clearer.
plot <- predict(final.fit, n.ahead = 100, plot=TRUE)